View Javadoc
1   package edu.jiangxin.apktoolbox.pdf;
2   
3   import com.itextpdf.io.image.ImageDataFactory;
4   import com.itextpdf.kernel.geom.PageSize;
5   import com.itextpdf.kernel.pdf.PdfDocument;
6   import com.itextpdf.kernel.pdf.PdfReader;
7   import com.itextpdf.kernel.pdf.PdfWriter;
8   import com.itextpdf.layout.Document;
9   import com.itextpdf.layout.element.AreaBreak;
10  import com.itextpdf.layout.element.Image;
11  import com.itextpdf.layout.properties.HorizontalAlignment;
12  import org.apache.commons.io.IOUtils;
13  import org.apache.logging.log4j.LogManager;
14  import org.apache.logging.log4j.Logger;
15  import org.apache.pdfbox.Loader;
16  import org.apache.pdfbox.pdmodel.PDDocument;
17  import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
18  import org.apache.pdfbox.pdmodel.PDPage;
19  import org.apache.pdfbox.pdmodel.PDPageTree;
20  import org.apache.pdfbox.text.PDFTextStripper;
21  
22  import javax.imageio.ImageIO;
23  import java.awt.image.BufferedImage;
24  import java.io.File;
25  import java.io.FileOutputStream;
26  import java.io.IOException;
27  import java.util.Set;
28  
29  public class PdfUtils {
30      private static final Logger LOGGER = LogManager.getLogger(PdfUtils.class.getSimpleName());
31      public static boolean isScannedPdf(File file, int threshold) {
32          int length = 0;
33  
34          try (PDDocument document = Loader.loadPDF(file)) {
35              boolean isEncrypted = document.isEncrypted();
36              if (isEncrypted) {
37                  document.setAllSecurityToBeRemoved(true);
38              }
39  
40              PDFTextStripper stripper = new PDFTextStripper();
41              String text = stripper.getText(document).trim();
42              length = text.length();
43          } catch (IOException e) {
44              LOGGER.error("Error reading PDF file: {}", e.getMessage());
45              return false;
46          }
47          LOGGER.info("Processing file: {}, text size: {}", file.getPath(), length);
48          return length < threshold;
49      }
50  
51      public static boolean isEncryptedPdf(File file) {
52          boolean isEncrypted;
53  
54          try (PDDocument document = Loader.loadPDF(file)) {
55              isEncrypted = document.isEncrypted();
56          } catch (IOException e) {
57              LOGGER.error("Error reading PDF file: {}", e.getMessage());
58              return false;
59          }
60          LOGGER.info("Processing file: {}, is encrypted: {}", file.getPath(), isEncrypted);
61          return isEncrypted;
62      }
63  
64      public static boolean isNonOutlinePdf(File file) {
65          boolean hasOutline = false;
66  
67          try (PDDocument document = Loader.loadPDF(file)) {
68              boolean isEncrypted = document.isEncrypted();
69              if (isEncrypted) {
70                  document.setAllSecurityToBeRemoved(true);
71              }
72  
73              if (document.getDocumentCatalog() != null && document.getDocumentCatalog().getDocumentOutline() != null) {
74                  hasOutline = true;
75              }
76          } catch (IOException e) {
77              LOGGER.error("Error reading PDF file: {}", e.getMessage());
78              return false;
79          }
80          LOGGER.info("Processing file: {}, has outline: {}", file.getPath(), hasOutline);
81          return !hasOutline;
82      }
83  
84      public static boolean hasAnnotations(File file) {
85          boolean hasAnnotations = false;
86  
87          try (PDDocument document = Loader.loadPDF(file)) {
88              boolean isEncrypted = document.isEncrypted();
89              if (isEncrypted) {
90                  document.setAllSecurityToBeRemoved(true);
91              }
92              PDDocumentCatalog catalog = document.getDocumentCatalog();
93              if (catalog == null) {
94                  return false;
95              }
96              PDPageTree pages = document.getDocumentCatalog().getPages();
97              if (pages == null || pages.getCount() == 0) {
98                  return false;
99              }
100 
101             for (PDPage page : pages) {
102                 if (page.getAnnotations() != null && !page.getAnnotations().isEmpty()) {
103                     int pageNumber = page.getCOSObject().getInt("PageNumber", 0);
104                     String subType = page.getAnnotations().get(0).getSubtype();
105                     LOGGER.info("Found annotations on page: {}, subType: {}", pageNumber, subType);
106                     if (!subType.equals("Link")) {
107                         hasAnnotations = true;
108                         break; // No need to check further if we found annotations
109                     }
110                 }
111             }
112         } catch (IOException e) {
113             LOGGER.error("Error reading PDF file: {}", e.getMessage());
114             return hasAnnotations;
115         }
116         LOGGER.info("Processing file: {}, has annotations: {}", file.getPath(), hasAnnotations);
117         return hasAnnotations;
118     }
119 
120     public static void removePassword(File encryptedFile, File targetDir) {
121         try (PDDocument document = Loader.loadPDF(encryptedFile)) {
122             boolean isEncrypted = document.isEncrypted();
123             if (isEncrypted) {
124                 document.setAllSecurityToBeRemoved(true);
125             }
126             String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
127             document.save(targetFilePath);
128             LOGGER.info("Remove password success: {}", targetFilePath);
129         } catch (IOException e) {
130             LOGGER.error("Error processing PDF file: {}", e.getMessage());
131         }
132     }
133 
134     public static void removePasswordWithIText(File encryptedFile, File targetDir) {
135         String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
136         PdfReader reader = null;
137         PdfDocument pdfDoc = null;
138         PdfWriter writer = null;
139         try {
140             reader = new PdfReader(encryptedFile);
141             reader.setUnethicalReading(true);
142             writer = new PdfWriter(targetFilePath);
143             pdfDoc = new PdfDocument(reader, writer);
144         } catch (IOException e) {
145             LOGGER.error("Error processing PDF file: {}", e.getMessage());
146         } finally {
147             IOUtils.closeQuietly(writer);
148             IOUtils.closeQuietly(pdfDoc);
149             IOUtils.closeQuietly(reader);
150         }
151     }
152 
153     public static int getPageCount(File file) {
154         int pageCount = 0;
155 
156         try (PDDocument document = Loader.loadPDF(file)) {
157             boolean isEncrypted = document.isEncrypted();
158             if (isEncrypted) {
159                 document.setAllSecurityToBeRemoved(true);
160             }
161             pageCount = document.getNumberOfPages();
162         } catch (IOException e) {
163             LOGGER.error("Error reading PDF file: {}", e.getMessage());
164             return 0;
165         }
166         LOGGER.info("Processing file: {}, page count: {}", file.getPath(), pageCount);
167         return pageCount;
168     }
169 
170     public static void imagesToPdf(Set<File> images, File targetFile) {
171         PdfDocument pdfDoc = null;
172         PdfWriter writer = null;
173         Document doc = null;
174         try {
175             writer = new PdfWriter(new FileOutputStream(targetFile));
176             pdfDoc = new PdfDocument(writer);
177             doc = new Document(pdfDoc);
178 
179             for (File img : images) {
180                 BufferedImage bufferedImage = ImageIO.read(img);
181                 float width = bufferedImage.getWidth();
182                 float height = bufferedImage.getHeight();
183 
184                 PageSize pageSize = new PageSize(width, height);
185                 pdfDoc.addNewPage(pageSize);
186 
187                 Image image = new Image(ImageDataFactory.create(img.getAbsolutePath()));
188                 image.setFixedPosition(pdfDoc.getNumberOfPages(), 0, 0, width);
189 
190                 doc.setMargins(0, 0, 0, 0);
191                 doc.add(image);
192             }
193         } catch (IOException e) {
194             LOGGER.error("Error processing PDF file: {}", e.getMessage());
195         } finally {
196             IOUtils.closeQuietly(doc);
197             IOUtils.closeQuietly(pdfDoc);
198             IOUtils.closeQuietly(writer);
199         }
200     }
201 }